Please click on a link below to jump to a specific section
Risk analytics in banking and financial services: understand how data is used to minimise the risk of losing money while lending to customers
Two types of risks are associated with the bank's decision:
Below information can be taken from loan application
Four types of decisions that could be taken by client/company
1. Identify the variables which are strong indicators of default
2. These variables will be utilized in portfolio and risk assessment
# Filtering out the warnings
import warnings
warnings.filterwarnings('ignore')
# All library imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
# Reading csv and assign to variable
# For simplicity, let's call dataframes as 'application_df' and 'prev_application_df'
application_df = pd.read_csv('application_data.csv')
prev_application_df = pd.read_csv('previous_application.csv')

# First look at the data: top rows, dimensions, dtypes and non-null counts
application_df.head()
prev_application_df.head()
application_df.shape
prev_application_df.shape
application_df.info()
prev_application_df.info()

# Summary statistics — numerical columns only
application_df.describe(include=[np.number])
# Summary statistics — numerical and categorical columns
application_df.describe(include='all')
# Numerical values
prev_application_df.describe(include=[np.number])
# Numerical and Categorical values
prev_application_df.describe(include='all')

application_df.dtypes
# info()/dtypes truncate wide frames, so print every column's dtype explicitly.
# FIX(review): the original loop bodies were unindented (IndentationError as
# written in the exported script); indentation restored here.
for column in application_df:
    print(column, '\t', application_df[column].dtypes)
for column in prev_application_df:
    print(column, '\t', prev_application_df[column].dtypes)
From the observation, application_df has 124 columns, many of which are irrelevant to the analysis and fall outside its objective. In this section we will first remove such columns, and then remove the rows that are insufficient for analysis
Example: EXT_SOURCE_1, EXT_SOURCE_2, EXT_SOURCE_3 and so on
# Columns at positions 41..121 (EXT_SOURCE_*, building statistics, etc.) carry
# mostly-empty or irrelevant information — drop them by position.
# list(range(...)) replaces the redundant comprehension over range.
irrelevant_columns = list(range(41, 122))
application_df.drop(application_df.columns[irrelevant_columns], axis=1, inplace=True)
# verification
print(application_df.shape)
application_df.head()
prev_application_df.head()

# Round the goods price to 2 decimals — the vectorised Series.round is
# equivalent to the former element-wise apply(round) and NaNs pass through.
application_df['AMT_GOODS_PRICE'] = application_df['AMT_GOODS_PRICE'].round(2)
application_df['AMT_GOODS_PRICE']

# finding null values through out the dataframe
application_df.isnull().sum()
application_df.shape
# From the above 2 cells, 'OCCUPATION_TYPE' and 'OWN_CAR_AGE' contain a
# significant amount of null values.
# let's inspect 'OWN_CAR_AGE': Age of client's car
application_df['OWN_CAR_AGE'].isna().sum()
# NaN appears to mean the client never owned a car, so derive a boolean
# 'HAS_OWN_CAR' flag for later insights. Series.notna() is the idiomatic,
# vectorised equivalent of np.where(isnull(), False, True).
application_df['HAS_OWN_CAR'] = application_df['OWN_CAR_AGE'].notna()
# verification
application_df[application_df['HAS_OWN_CAR']==False]
# let's inspect 'OCCUPATION_TYPE'
application_df['OCCUPATION_TYPE'].isna().sum()
application_df['OCCUPATION_TYPE'].unique()
# Null values are significantly high — likely the client declined to answer
# (low chance of data-entry error), so bucket them under 'Others'.
# FIX: assign the result instead of fillna(..., inplace=True) on a column
# selection — the inplace form is chained assignment, which is deprecated
# and no longer works under pandas Copy-on-Write / pandas 3.0.
application_df['OCCUPATION_TYPE'] = application_df['OCCUPATION_TYPE'].fillna('Others')
# verification
application_df['OCCUPATION_TYPE'].isna().sum()
application_df['OCCUPATION_TYPE'].unique()
# let's inspect column 'NAME_TYPE_SUITE': Who was accompanying client when he was applying for the loan
application_df['NAME_TYPE_SUITE'].unique()
application_df['NAME_TYPE_SUITE'].isna().sum()
# Null count is insignificantly low; since it's categorical, check the mode.
application_df['NAME_TYPE_SUITE'].value_counts()
# 'Unaccompanied' dominates, so impute missing values with it.
# FIX: assignment instead of fillna(..., inplace=True) on a column selection —
# the inplace form is chained assignment (deprecated, broken in pandas 3.0).
application_df['NAME_TYPE_SUITE'] = application_df['NAME_TYPE_SUITE'].fillna('Unaccompanied')
# verification
application_df['NAME_TYPE_SUITE'].unique()
application_df['NAME_TYPE_SUITE'].isna().sum()
# let inspect Column 'AMT_GOODS_PRICE': For consumer loans it is the price of the goods for which the loan is given
application_df['AMT_GOODS_PRICE'].isna().sum()
# BUG FIX: '&' binds tighter than '==' in Python, so the unparenthesised
# `mask & df['TARGET']==1` evaluated as `(mask & df['TARGET']) == 1`.
# Parenthesise each condition explicitly (standard pandas boolean-mask rule).
application_df[application_df['AMT_GOODS_PRICE'].isna() & (application_df['TARGET'] == 1)]
# only 21 clients are having difficulty in paying loans and all of the loans are revolving loans
# that is why there no value for AMT_GOODS_PRICE
# let's inspect AMT_ANNUITY
application_df[application_df['AMT_ANNUITY'].isnull()]
# Only an insignificant number of nulls, so drop those rows
# (notna() replaces the double-negative ~isnull()).
application_df = application_df[application_df['AMT_ANNUITY'].notna()]
# verification
application_df['AMT_ANNUITY'].isnull().sum()
# inspecting prev_application_df
prev_application_df.shape
prev_application_df.isnull().sum()
# Null count is insignificantly low; since it's categorical, check the mode.
prev_application_df['NAME_TYPE_SUITE'].value_counts()
# 'Unaccompanied' dominates, so impute missing values with it.
# FIX: assignment instead of fillna(..., inplace=True) on a column selection —
# the inplace form is chained assignment (deprecated, broken in pandas 3.0).
prev_application_df['NAME_TYPE_SUITE'] = prev_application_df['NAME_TYPE_SUITE'].fillna('Unaccompanied')
# verification
prev_application_df['NAME_TYPE_SUITE'].unique()
prev_application_df['NAME_TYPE_SUITE'].isnull().sum()
# RATE_INTEREST_PRIMARY and RATE_INTEREST_PRIVILEGED are almost entirely
# missing — drop both columns.
prev_application_df = prev_application_df.drop(
    columns=['RATE_INTEREST_PRIMARY', 'RATE_INTEREST_PRIVILEGED'])
prev_application_df.head()
# The loan-schedule DAYS_* columns and NFLAG_INSURED_ON_APPROVAL are also
# mostly missing — drop them as well.
schedule_cols_mostly_missing = [
    'DAYS_FIRST_DRAWING',
    'DAYS_FIRST_DUE',
    'DAYS_LAST_DUE_1ST_VERSION',
    'DAYS_LAST_DUE',
    'DAYS_TERMINATION',
    'NFLAG_INSURED_ON_APPROVAL',
]
prev_application_df = prev_application_df.drop(columns=schedule_cols_mostly_missing)
prev_application_df.head()
# AMT_DOWN_PAYMENT and RATE_DOWN_PAYMENT are missing too often and no value
# can reasonably be assumed — drop those columns too.
prev_application_df = prev_application_df.drop(
    columns=['AMT_DOWN_PAYMENT', 'RATE_DOWN_PAYMENT'])
prev_application_df.head()
prev_application_df.isnull().sum()
prev_application_df.shape
# Columns AMT_ANNUITY, AMT_GOODS_PRICE and CNT_PAYMENT are each ~20% null.
# dropna(subset=...) removes rows with a null in ANY of the three columns in a
# single pass — equivalent to the former three chained ~isnull() filters.
prev_application_df = prev_application_df.dropna(
    subset=['AMT_ANNUITY', 'AMT_GOODS_PRICE', 'CNT_PAYMENT'])
# verification
prev_application_df.isnull().sum()
prev_application_df.shape
application_df.shape
# Univariate look at AMT_INCOME_TOTAL in application_df: summary stats,
# box plot, and upper quantiles to spot outliers.
income = application_df['AMT_INCOME_TOTAL']
income.describe().apply("{0:.2f}".format)
plt.figure(figsize=[20, 5])
sns.boxplot(income)
plt.show()
income.quantile([0.05, 0.10, 0.5, 0.7, 0.85, 0.9, 0.95, 0.99])
application_df[income > 337500].describe()
application_df[(income > 3375000.0) & (application_df['TARGET'] == 1)]
# Clients above the threshold are high earners and only one of them has
# difficulty paying instalments — remove them. The ~(> threshold) form is
# kept so rows with NaN income (if any) are retained.
application_df = application_df[~(application_df['AMT_INCOME_TOTAL'] > 3375000)]
# verification
plt.figure(figsize=[20, 5])
sns.boxplot(application_df['AMT_INCOME_TOTAL'])
plt.show()
# let's inspect column AMT_CREDIT from application_df
application_df.AMT_CREDIT.describe().apply("{0:.2f}".format)
plt.figure(figsize=[20,5])
sns.boxplot(application_df.AMT_CREDIT)
plt.show()
application_df.AMT_CREDIT.quantile([0.05,0.10,0.5,0.7,0.85, 0.9,0.95, 0.99])
application_df[(application_df.AMT_CREDIT>1854000.0) & (application_df.TARGET ==1)]
application_df[(application_df.AMT_CREDIT>1854000) & (application_df.TARGET ==0)]
# Only 124 clients with extremely high credit are having difficulty paying,
# while 2950 have no issues; these are genuine credit amounts and should not
# be imputed or removed.
# let's inspect column AMT_ANNUITY from application_df
application_df.AMT_ANNUITY.describe().apply("{0:.2f}".format)
plt.figure(figsize=[20,5])
sns.boxplot(application_df.AMT_ANNUITY)
plt.show()
# let's inspect column AMT_GOODS_PRICE from application_df
application_df.AMT_GOODS_PRICE.describe().apply("{0:.2f}".format)
plt.figure(figsize=[20,5])
# BUG FIX: this cell is about AMT_GOODS_PRICE but previously re-plotted
# AMT_ANNUITY (copy-paste slip) — plot the goods price itself.
sns.boxplot(application_df.AMT_GOODS_PRICE)
plt.show()
# AMT_GOODS_PRICE and AMT_ANNUITY are factors that may affect credit
# difficulties; further analysis will be done in the multivariate section,
# hence keeping all records (even outliers) at this point.
# Round all monetary columns to 2 decimals and convert the Y/N FLAG columns
# to Boolean. Vectorised Series/DataFrame.round replaces the per-element
# apply(round) (same values; NaNs pass through).
money_cols = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE']
application_df[money_cols] = application_df[money_cols].round(2)
prev_money_cols = ['AMT_ANNUITY', 'AMT_CREDIT', 'AMT_GOODS_PRICE']
prev_application_df[prev_money_cols] = prev_application_df[prev_money_cols].round(2)

application_df['FLAG_OWN_CAR'].unique()
# `== 'Y'` produces the same True/False values as the former
# apply(lambda x: True if x=='Y' else False), without a Python-level loop.
application_df['FLAG_OWN_CAR'] = application_df['FLAG_OWN_CAR'] == 'Y'
application_df['FLAG_OWN_CAR'].unique()
application_df['FLAG_OWN_CAR'].dtype
application_df['FLAG_OWN_REALTY'].unique()
application_df['FLAG_OWN_REALTY'] = application_df['FLAG_OWN_REALTY'] == 'Y'
application_df['FLAG_OWN_REALTY'].unique()
application_df['FLAG_OWN_CAR'].dtype
application_df['FLAG_OWN_REALTY'].unique()

# DAYS_BIRTH is stored as a (negative) day count; convert to age in whole years.
application_df['AGE'] = application_df['DAYS_BIRTH'].apply(lambda x: round(abs(x/365)))
application_df['AGE']
# Univariate analysis of categorical variables: for each column show raw
# counts, normalized shares, and a horizontal bar chart of the shares.

# TARGET — 1 marks clients with payment difficulties (see earlier cells)
application_df['TARGET'].value_counts()
application_df['TARGET'].value_counts(normalize=True)
application_df['TARGET'].value_counts(normalize=True).plot.barh()
plt.show()
# Gender distribution
application_df['CODE_GENDER'].value_counts()
application_df['CODE_GENDER'].value_counts(normalize=True)
application_df['CODE_GENDER'].value_counts(normalize=True).plot.barh()
plt.show()
# Car ownership (Boolean flag converted from Y/N earlier)
application_df['FLAG_OWN_CAR'].value_counts()
application_df['FLAG_OWN_CAR'].value_counts(normalize=True)
application_df['FLAG_OWN_CAR'].value_counts(normalize=True).plot.barh()
plt.show()
# Realty ownership
application_df['FLAG_OWN_REALTY'].value_counts()
application_df['FLAG_OWN_REALTY'].value_counts(normalize=True)
application_df['FLAG_OWN_REALTY'].value_counts(normalize=True).plot.barh()
plt.show()
# Family status
application_df['NAME_FAMILY_STATUS'].value_counts()
application_df['NAME_FAMILY_STATUS'].value_counts(normalize=True)
application_df['NAME_FAMILY_STATUS'].value_counts(normalize=True).plot.barh()
plt.show()
# Education level, shown as a pie chart of shares
application_df['NAME_EDUCATION_TYPE'].value_counts()
plt.figure(figsize=[10,10])
application_df['NAME_EDUCATION_TYPE'].value_counts(normalize=True).plot.pie(title="Distribution by Education type")
plt.show()
# Bivariate analysis of the numeric AMT_* columns.
# AMT_INCOME_TOTAL float64
# AMT_CREDIT float64
# AMT_ANNUITY float64
# AMT_GOODS_PRICE
application_df.dtypes
# Scatter: annuity vs total income
plt.figure(figsize=[10, 5])
plt.scatter(application_df['AMT_ANNUITY'], application_df['AMT_INCOME_TOTAL'])
plt.show()
# Scatter: goods price vs credit amount
plt.figure(figsize=[10, 5])
plt.scatter(application_df['AMT_GOODS_PRICE'], application_df['AMT_CREDIT'])
plt.show()
# Pairwise scatter matrix of the three amount columns
sns.pairplot(data=application_df, vars=['AMT_ANNUITY','AMT_GOODS_PRICE','AMT_CREDIT'])
plt.show()
# Pairwise (Pearson) correlation matrix of the same columns
application_df[['AMT_ANNUITY','AMT_GOODS_PRICE','AMT_CREDIT']].corr()
Correlation heatmap
# Annotated heatmap of the correlation matrix computed above
sns.heatmap(application_df[['AMT_ANNUITY','AMT_GOODS_PRICE','AMT_CREDIT']].corr(), annot=True, cmap='Greens')
# let's analyse TARGET vs AMT_INCOME_TOTAL
# mean and median income per TARGET class, shown side by side as bars
application_df.groupby('TARGET')['AMT_INCOME_TOTAL'].aggregate(["mean", "median"]).plot.bar()
# Parallel-categories plot: contract type -> education -> family status -> TARGET,
# coloured by client age, to see how the categories flow into difficulty status.
# FIX: corrected the displayed axis-label typo 'Eductation Level' -> 'Education Level'.
fig = px.parallel_categories(application_df, dimensions=['NAME_CONTRACT_TYPE','NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS', 'TARGET'],
                             color="AGE", color_continuous_scale=px.colors.sequential.Inferno,
                             labels={'NAME_CONTRACT_TYPE':'Contract type', 'NAME_EDUCATION_TYPE':'Education Level','NAME_FAMILY_STATUS':'Family status', 'TARGET':'Having difficulties'})
fig.show()